import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from glob import glob
from IPython.display import display
import time
%matplotlib inline
plt.style.available
#plt.style.use('ggplot')
plt.style.use('fivethirtyeight')
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')
pd.options.display.float_format = '{:.2f}'.format
rc={'savefig.dpi': 350, 'figure.autolayout': False, 'figure.figsize': [15, 5], 'axes.labelsize': 10,\
'axes.titlesize': 18, 'font.size': 8, 'lines.linewidth': 2.0, 'lines.markersize': 8, 'legend.fontsize': 12,\
'xtick.labelsize': 12, 'ytick.labelsize': 12}
sns.set(style='dark',rc=rc)
np.set_printoptions(linewidth=120)
default_color = '#56B4E9'
colormap = plt.cm.cool
# Path to the raw data files
path = 'dados/raw/'
files = np.array(glob(path + "*"))
#read them into pandas
df_list = [pd.read_csv(file, na_values=-1) for file in files]
#concatenate them together
df = pd.concat(df_list,ignore_index=True)
df.shape
df.describe()
df.columns[:30]
df['ROCK TYPE'].value_counts()
ROCK TYPE:
Plutonic rocks have chemistry similar to their volcanic counterparts, since they share the same composition and differ only in the formation process: plutonic rocks form by magmatic crystallization at depth (inside the Earth), while volcanic rocks form by subaerial or subaqueous magmatic crystallization. These rocks can therefore confuse the model.
Metamorphic rocks can also have chemistry similar to the other rock types, since they are the product of the deformation of a pre-existing rock. For example, if a sedimentary rock is deformed it becomes a metamorphic rock, but its chemistry remains similar, or very close, to that of its sedimentary protolith. These rocks can therefore also confuse the model.
Sedimentary rocks are formed by the erosion of other rocks, so they can carry chemical signatures of all rock types, which can likewise confuse the model.
The remaining rock types were not considered because of their restricted occurrence in nature and the small number of samples of these types in the database.
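# keep only volcanic samples, and only those with no recorded alteration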
df = df[df['ROCK TYPE'].isin(['VOL'])]
df = df[pd.isnull(df['ALTERATION'])]
target_names = df['ROCK NAME'].unique()
print('Total unique target values: %d' % len(target_names))
target_names.sort()
target_names[:1000]
def remove_numbers_from_target(string):
    """Strip the trailing numeric id (e.g. ' [nnnnn]') from a rock name."""
    end = string.find(' [')
    result = string
    if end != -1:
        result = string[:end]
    return result
df['ROCK NAME']
df['ROCK NAME'] = df['ROCK NAME'].apply(str).apply(remove_numbers_from_target)
df['ROCK NAME']
target_names = df['ROCK NAME'].unique()
print('Total unique target values: %d' % len(target_names))
The total number of unique target values decreased because the numeric suffix ("[nnnnn]") was duplicating otherwise identical values.
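A quick sanity check of the helper on a made-up name (the bracketed id here is hypothetical, for illustration only):
# hypothetical rock name; real entries carry ids like ' [nnnnn]'
remove_numbers_from_target('BASALT [12345]')  # -> 'BASALT'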
df['ROCK NAME'] = df['ROCK NAME'].str.strip()
target_names.sort()
print('\n'.join(target_names))
target = df['ROCK NAME']
plt.figure(figsize=(15,5))
ax = df['ROCK NAME'].value_counts().plot(kind='bar', title=u'Number of occurrences of the unique values of the class attribute (ROCK NAME)')
ax.grid(True)
#ax = sns.countplot('ROCK NAME',data=df,color=default_color, order = target.value_counts().index)
#ax.set(xticklabels=[])
ax.set_xlabel('ROCK NAME',fontsize= 15)
ax.set_ylabel('Count', fontsize=15)
plt.savefig('imgs/Fig_1.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
plt.show()
target.value_counts()
target_gt_10 = [x for x in target.value_counts().index if target.value_counts()[x] >= 10]
with pd.option_context('display.max_rows', 200):
    print(target.value_counts()[target_gt_10])
    print(len(target.value_counts()[target_gt_10]))
for bad in ('NOT GIVEN', 'nan'):
    if bad in target_gt_10:
        target_gt_10.remove(bad)
df = df[df['ROCK NAME'].isin(target_gt_10)]
df.shape
target = df['ROCK NAME']
plt.figure(figsize=(15,5))
plt.xticks(fontsize=9)
ax = df['ROCK NAME'].value_counts().plot(kind='bar', title=u'Number of occurrences of the target values "ROCK NAME"')
#ax = sns.countplot('ROCK NAME',data=df,color=default_color, order = target.value_counts().index)
#ax.set(xticklabels=[])
plt.show()
with pd.option_context('display.max_rows', 200):
    print(target.value_counts()[target_gt_10])
    print(len(target.value_counts()[target_gt_10]))
def get_meta(train, not_keep_list=[]):
    data = []
    for col in train.columns:
# Defining the role
if col == 'ROCK NAME':
role = 'target'
elif col == 'id':
role = 'id'
else:
role = 'input'
# Defining the level
if train[col].dtype == np.float64:
level = 'interval'
elif train[col].dtype == np.int64:
level = 'ordinal'
else:
level = 'string'
# Initialize keep to True for all variables except for id
keep = True
if col == 'id':
keep = False
elif col in not_keep_list:
keep = False
# Defining the data type
dtype = train[col].dtype
# Creating a Dict that contains all the metadata for the variable
col_dict = {
'varname': col,
'role' : role,
'level' : level,
'keep' : keep,
'dtype' : dtype
}
data.append(col_dict)
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)
return meta
meta_data = get_meta(df)
with pd.option_context('display.max_rows', 200):
display(meta_data)
first_quim_analises_index = np.where(df.columns == 'SIO2(WT%)')[0][0]
remove_columns_top = df.columns[:first_quim_analises_index].values
index = np.where(remove_columns_top == 'ROCK NAME')[0][0]  # remove the target variable column from the list
remove_columns_top = np.delete(remove_columns_top, index)
remove_columns_top
last_quim_analises_index = np.where(df.columns == 'U(PPM)')[0][0]
remove_columns_bottom = df.columns[last_quim_analises_index + 1:].values
remove_columns_bottom
remove_columns = np.concatenate((remove_columns_top,remove_columns_bottom),axis=0)
remove_columns
meta_data = get_meta(df,remove_columns)
with pd.option_context('display.max_rows', 200):
display(meta_data)
meta_counts = meta_data[meta_data['keep'] == True].groupby(['role', 'level']).agg({'dtype': lambda x: x.count() }).reset_index()
meta_counts
col_interval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)].index
col_target = meta_data[(meta_data.role == 'target')].index
col_string = meta_data[(meta_data.level == 'string')].index
display(col_interval.values)
display(col_target.values)
df[col_interval].shape
with pd.option_context('display.max_rows', 200):
display(df[col_interval].isnull().sum())
with pd.option_context('display.max_rows', 200):
display(df[col_target].isnull().sum())
msno.bar(df[col_interval],figsize=(20,8),color=default_color,fontsize=9,labels=True)
with pd.option_context('display.max_rows', 200):
display(df[col_interval].count())
no_data_cols = df[col_interval].count()[df[col_interval].count() == 0].index.values
no_data_cols
lt_1000_data_cols = df[col_interval].count()[(df[col_interval].count() < 1000) & (df[col_interval].count() > 0)].index.values
lt_1000_data_cols
remove_columns = np.concatenate((remove_columns,no_data_cols,lt_1000_data_cols), axis=0)
remove_columns
meta_data = get_meta(df,remove_columns)
with pd.option_context('display.max_rows', 200):
display(meta_data)
col_interval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)].index
col_target = meta_data[(meta_data.role == 'target')].index
display(col_interval.values)
display(col_target.values)
with pd.option_context('display.max_rows', 200):
display(df[col_interval].count())
#msno.dendrogram(df[col_interval],figsize=(20,20),fontsize=10)
sorted_data = msno.nullity_sort(df[col_interval], sort='descending') # or sort='ascending'
ms_plt = msno.matrix(sorted_data,figsize=(20,10),fontsize=12,labels=True, inline=False, sparkline=False)
ms_plt.suptitle(u'Visualization of missing values in the database', fontsize=18)
ms_plt.axes[0].tick_params(axis='y', labelsize=12)
ms_plt.savefig('imgs/Fig_2.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
sparse_data_columns = np.array(['CR2O3(WT%)','FE2O3(WT%)','FEO(WT%)',
'CO2(WT%)','S(WT%)','S(PPM)',
'GA(PPM)','AS(PPM)','PD(PPM)',
'SN(PPM)','CS(PPM)','W(PPM)', 'IR(PPM)',
'PT(PPM)','PB(PPM)'])
sparse_data_columns
remove_columns = np.concatenate((remove_columns,sparse_data_columns), axis=0)
remove_columns
meta_data = get_meta(df,remove_columns)
with pd.option_context('display.max_rows', 200):
display(meta_data)
col_interval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)].index
col_target = meta_data[(meta_data.role == 'target')].index
display(col_interval.values)
display(col_target.values)
#msno.dendrogram(df[col_interval],figsize=(20,20),fontsize=10)
sorted_data = msno.nullity_sort(df[col_interval], sort='descending') # or sort='ascending'
msno.matrix(sorted_data,figsize=(20,10),fontsize=10,labels=True)
df = df.drop(df[df['SIO2(WT%)'].isnull()].index, axis=0)
df[col_interval].shape
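# drop rows missing more than 30% of the interval (chemistry) columns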
df = df.drop(df[col_interval][df[col_interval].isnull().sum(axis=1) > df[col_interval].shape[1]*0.3].index,axis=0)
df[col_interval].shape
df['ROCK NAME'] = df['ROCK NAME'].str.rstrip()
#msno.dendrogram(df[col_interval],figsize=(20,20),fontsize=10)
sorted_data = msno.nullity_sort(df[col_interval], sort='descending') # or sort='ascending'
msno.matrix(sorted_data,figsize=(20,10),fontsize=10,labels=True)
msno.heatmap(df[col_interval],figsize=(20,10),cmap=colormap,labels=False, fontsize=10)
target = df['ROCK NAME']
plt.figure(figsize=(15,5))
plt.xticks(fontsize=9)
ax = df['ROCK NAME'].value_counts().plot(kind='bar', title=u'Number of occurrences of the target values "ROCK NAME"')
#ax = sns.countplot('ROCK NAME',data=df,color=default_color, order = target.value_counts().index)
#ax.set(xticklabels=[])
plt.show()
target_names = df['ROCK NAME'].unique()
print('Total unique target values: %d' % len(target_names))
target_names.sort()
print('\n'.join(target_names))
sns.set(font_scale=2)
plt.figure(figsize=(30,30))
plt.title('Pearson correlation of continuous features', y=1.05, size=30)
sns.heatmap(df[col_interval].corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white',
annot=True, annot_kws={"size": 12})
# Correlation of the interval variables only; it should not be computed on binary or nominal variables
plt.show()
Create a data frame with the mean values grouped by 'ROCK NAME'
df[col_interval].shape
df_mean_interval_gb_RN = df[np.concatenate((col_interval,col_target),axis=0)].groupby(['ROCK NAME']).mean()
df_mean_interval_gb_RN
Replacing missing values with the group means
for col, val in df[col_interval].isnull().sum().items():
    if val > 0:
        RN_arr = df.loc[df[col][df[col].isnull()].index, 'ROCK NAME']
        for idx, rn in RN_arr.items():
            df.loc[idx, col] = df_mean_interval_gb_RN.loc[rn, col]
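An equivalent vectorized form of the loop above (a sketch; the temporary group_means frame is introduced here for illustration, and every missing value is filled with its group's mean in one pass instead of cell by cell):
# group-wise mean imputation in one shot; same result as the loop above
group_means = df.groupby('ROCK NAME')[col_interval].transform('mean')
df[col_interval] = df[col_interval].fillna(group_means)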
Visualize the missing values after the operation
#msno.dendrogram(df[col_interval],figsize=(20,20),fontsize=10)
sorted_data = msno.nullity_sort(df[col_interval], sort='descending') # or sort='ascending'
msno.matrix(sorted_data,figsize=(20,10),fontsize=10,labels=True)
df[col_interval].isnull().sum()
df = df.drop(df[col_interval][df[col_interval].isnull().sum(axis=1) > 0].index,axis=0)
df[col_interval].shape
target_gt_10 = [x for x in target.value_counts().index if target.value_counts()[x] >= 10]
with pd.option_context('display.max_rows', 200):
print (target.value_counts()[target_gt_10])
#print len(target.value_counts()[target_gt_10])
df = df[df['ROCK NAME'].isin(target_gt_10)]
df.shape
df[col_interval].shape
#msno.dendrogram(df[col_interval],figsize=(20,20),fontsize=10)
sorted_data = msno.nullity_sort(df[col_interval], sort='descending') # or sort='ascending'
ms_plt = msno.matrix(sorted_data,figsize=(20,10),fontsize=12,labels=True, inline=False, sparkline=False)
ms_plt.suptitle(u'Visualization of missing values in the database', fontsize=18)
ms_plt.axes[0].tick_params(axis='y', labelsize=12)
ms_plt.savefig('imgs/Fig_3.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
target = df['ROCK NAME']
plt.figure(figsize=(15,5))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax = df['ROCK NAME'].value_counts().plot(kind = 'bar')
#ax = sns.countplot('ROCK NAME',data=df,color=default_color, order = target.value_counts().index)
#ax.set(xticklabels=[])
ax.set_title(u'Number of occurrences of the unique values of the class attribute (ROCK NAME)', fontsize=18)
ax.set_xlabel('ROCK NAME',fontsize= 15)
ax.set_ylabel('Count', fontsize=15)
plt.savefig('imgs/Fig_4.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
plt.show()
target_names = df['ROCK NAME'].unique()
print('Total unique target values: %d' % len(target_names))
target_names.sort()
print('\n'.join(target_names))
sns.set(font_scale=2)
plt.figure(figsize=(30,30))
plt.title(u'Pearson correlation of the attributes', y=1.01, size=30)
corr_plt = sns.heatmap(df[col_interval].corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white',
annot=True, annot_kws={"size": 15}, fmt='.0%', cbar_kws={"shrink": .5, "fraction": 0.15})
# Correlation of the interval variables only; it should not be computed on binary or nominal variables
corr_plt.figure.savefig('imgs/Fig_5.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
plt.show()
# ceil division so no column is dropped when len(col_interval) is not a multiple of 5
fig, axes = plt.subplots((len(col_interval) + 4) // 5, 5, figsize=(15, 20))
for col, axis in zip(col_interval, axes.flatten()):
df.hist(column = col, bins = 100, ax=axis, grid=False,
xlabelsize=10, ylabelsize=10)
axis.title.set_size(12)
#fig.show()
fig.savefig('imgs/Fig_6.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
from scipy import stats
df_bc = df.copy()
print('Applying Box-Cox transformation:')
for col in col_interval:
min_val = df_bc[col].min()
if min_val < 0:
df_bc[col] = df_bc[col].apply(lambda x: x - min_val + 0.00001)
elif min_val == 0:
df_bc[col] = df_bc[col].apply(lambda x: x + 0.00001)
bc, l = stats.boxcox(df_bc[col])
df_bc[col] = bc
    print(col, 'lambda =', l)
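The shift applied before the transform is required because stats.boxcox only accepts strictly positive data; a minimal check of that behavior on a toy array:
x = np.array([0.0, 1.0, 2.0])
try:
    stats.boxcox(x)                   # rejects nonpositive data
except ValueError as e:
    print(e)
print(stats.boxcox(x + 0.00001))      # works after the same small shift used above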
fig, axes = plt.subplots((len(col_interval) + 4) // 5, 5, figsize=(15, 20))
for col, axis in zip(col_interval, axes.flatten()):
df_bc.hist(column = col, bins = 100, ax=axis, grid=False,
xlabelsize=10, ylabelsize=10)
axis.title.set_size(12)
fig.savefig('imgs/Fig_7.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=150,
max_depth=8,
min_samples_leaf=4,
max_features=0.2,
n_jobs=-1,
random_state=420)
rf.fit(df_bc[col_interval], df_bc[col_target].values.ravel())
def get_feature_importance_df(feature_importances,
column_names,
top_n=25):
"""Get feature importance data frame.
Parameters
----------
feature_importances : numpy ndarray
Feature importances computed by an ensemble
model like random forest or boosting
column_names : array-like
Names of the columns in the same order as feature
importances
top_n : integer
Number of top features
Returns
-------
df : a Pandas data frame
"""
imp_dict = dict(zip(column_names,
feature_importances))
top_features = sorted(imp_dict,
key=imp_dict.get,
reverse=True)[0:top_n]
top_importances = [imp_dict[feature] for feature
in top_features]
df = pd.DataFrame(data={'feature': top_features,
'importance': top_importances})
return df
feature_importance = get_feature_importance_df(rf.feature_importances_, col_interval, len(col_interval))
feature_importance
fig,ax = plt.subplots()
fig.set_size_inches(15,5)
sns.barplot(data=feature_importance[:50],x="feature",y="importance",ax=ax,color=default_color)
ax.set_title(u"Importância dos atributos", fontsize=18)
ax.set_xlabel('Atributo',fontsize=15)
ax.set_ylabel(u'Importância',fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=12)
for item in ax.get_xticklabels():
item.set_rotation(90)
plt.savefig('imgs/Fig_8.png', bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
plt.show()
X = df_bc[col_interval]
y = df_bc[col_target].copy()  # copy so the assignment below does not hit a chained-indexing warning
y_factor = pd.factorize(y['ROCK NAME'])
y.loc[:,'ROCK NAME'] = y_factor[0]
rock_list = y_factor[1]
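rock_list keeps the original names in factor order, so integer codes can be decoded back by plain indexing; a quick sanity check:
rock_list[y_factor[0][:5]]  # first five targets, decoded back to rock names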
from sklearn.model_selection import StratifiedShuffleSplit
test_size = 0.2
seed = 420
split_indexes = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)  # a single stratified train/test split
for train_index, test_index in split_indexes.split(X, y):
#print X[train_index]
#print test_index
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#X_train, X_test, y_train, y_test = StratifiedShuffleSplit(df[col_interval], df[col_target], )
X.shape
y.shape
X_train.shape
X_test.shape
y_train.shape
y_test.shape
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
def cross_val_model(X,y, model, n_splits=3):
X_ = np.array(X)
y_ = np.array(y)
y_ = y_.reshape(y_.shape[0],)
#print y_.shape
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2017)
j=0
for train_idx, test_idx in folds.split(X_, y_):
X_train_ = X_[train_idx]
y_train_ = y_[train_idx]
X_holdout = X_[test_idx]
y_holdout = y_[test_idx]
print ("Fit %s fold %d" % (str(model).split('(')[0], j+1))
model.fit(X_train_, y_train_)
cross_score = cross_val_score(model, X_holdout, y_holdout, cv=3, scoring='accuracy')
print(" cross_score: %.5f" % cross_score.mean())
def grid_cv(X,y, model, param_grid):
X_ = np.array(X)
y_ = np.array(y)
y_ = y_.reshape(y_.shape[0],)
#t0 = time()
clf = GridSearchCV(model, param_grid, verbose=2,n_jobs=5)
clf = clf.fit(X_, y_)
#print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
    print(clf.best_estimator_)
    print(clf.best_params_)
    print(clf.best_score_)
return clf.best_estimator_
def evaluate_classifiers(model, X_validation, y_validation):
X_ = np.array(X_validation)
y_ = np.array(y_validation)
y_ = y_.reshape(y_.shape[0],)
results = []
predictions = model.predict(X_)
acc = accuracy_score(y_, predictions)
cm = confusion_matrix(y_, predictions)
report = classification_report(y_, predictions)
results.append((model, acc, cm, report))
return results
def report_classifiers_results(model_results, target_list, figname=''):
    for result in model_results:
        name, acc, cm, report = result
        print('Model: %s' % name)
        print('Accuracy: %f' % acc)
        print('Confusion Matrix: \n')
        title = u'Confusion matrix {}'.format(str(name).split('(')[0])
        print_confusion_matrix(cm, target_list, (15, 8), 12, figname, title)
        print('Report:')
        print(report)
        print(' ')
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=18,figname='',title=''):
df_cm = pd.DataFrame(
confusion_matrix, index=class_names, columns=class_names,
)
fig = plt.figure(figsize=figsize)
try:
heatmap = sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 12},
mask=(df_cm==0), linewidths=2, linecolor=(0.85,0.85,0.85),
cbar_kws={"shrink": 1, "fraction": 0.15})
except ValueError:
raise ValueError("Confusion matrix values must be integers.")
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True class', fontsize=18)
    plt.xlabel('Predicted class', fontsize=18)
    if title == '':
        plt.title(u'Confusion matrix', y=1.01, size=22)
    else:
        plt.title(title, y=1.01, size=22)
if figname != '':
plt.savefig('imgs/{}.png'.format(figname), bbox_inches='tight', dpi=350, frameon=False,facecolor='w', edgecolor='w')
plt.show()
X_train_np = np.array(X_train)
y_train_np = np.array(y_train)
y_train_np = y_train_np.ravel()  # flatten target to shape (n_samples,)
svc_params = {}
svc_params['probability'] = True
svc_params['random_state'] = 420
svc_model = SVC(**svc_params)
%%time
cross_val_model(X_train, y_train, svc_model, 3)
start_time = time.time()
svc_model.fit(X_train_np, y_train_np)
svc_time = time.time() - start_time
print('Train Time: %fs' % svc_time)
svc_result = evaluate_classifiers(svc_model, X_test, y_test)
report_classifiers_results(svc_result,rock_list,'Fig_9')
knn_params = {}
knn_params['n_neighbors'] = range(3,20,2)
knn_params['weights'] = ['uniform','distance']
knn_params['p'] = [1,2]
knn_model = KNeighborsClassifier()
knn_model_gscv = grid_cv(X_train,y_train,knn_model,knn_params)
%%time
cross_val_model(X_train, y_train, knn_model_gscv, 3)
start_time = time.time()
knn_model_gscv.fit(X_train_np, y_train_np)
knn_time = time.time() - start_time
print('Train Time: %fs' % knn_time)
knn_result = evaluate_classifiers(knn_model_gscv, X_test, y_test)
report_classifiers_results(knn_result,rock_list,'Fig_10')
#RandomForest params
rf_params = {}
rf_params['n_estimators'] = range(100,500,50)
rf_params['max_depth'] = range(3,15,2)
rf_params['min_samples_split'] = range(10,50,5)
rf_params['min_samples_leaf'] = range(3,15,2)
rf_params['criterion'] = ['gini','entropy']
rf_params['random_state'] = [420]
rf_model = RandomForestClassifier()
rf_model_gscv = grid_cv(X_train,y_train,rf_model,rf_params)
%%time
cross_val_model(X_train, y_train, rf_model_gscv, 3)
start_time = time.time()
rf_model_gscv.fit(X_train_np, y_train_np)
rf_time = time.time() - start_time
print('Train Time: %fs' % rf_time)
rf_result = evaluate_classifiers(rf_model_gscv, X_test, y_test)
report_classifiers_results(rf_result,rock_list,'Fig_11')
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = range(20,500,10)
xgb_params['max_depth'] = [5]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.8]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_model = XGBClassifier()
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
n_estimators = 390 found.
Since the search above used a step of 10 (range(20,500,10)), n_estimators is now refined with a step of 1 (range(380,400,1))
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = range(380,400,1)
xgb_params['max_depth'] = [5]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.8]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
n_estimators = 383 found
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = range(1,10,2)
xgb_params['n_estimators'] = [383]
xgb_params['max_depth'] = range(3,20,2)
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.8]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
max_depth = 3, min_child_weight = 1 chosen
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = [383]
xgb_params['max_depth'] = [3]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.8]
xgb_params['gamma'] = [i/10.0 for i in range(0,10)]
xgb_params['scale_pos_weight'] = [1]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
gamma = 0 chosen
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = range(100,500,20)
xgb_params['max_depth'] = [3]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.8]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
n_estimators = 480 found
Refine the search with a step of 2
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = range(460,500,2)
xgb_params['max_depth'] = [3]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.8]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
n_estimators = 480 chosen
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = [480]
xgb_params['max_depth'] = [3]
xgb_params['subsample'] = [i/10.0 for i in range(6,10)]
xgb_params['colsample_bytree'] = [i/10.0 for i in range(6,10)]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
colsample_bytree = 0.9, subsample = 0.8 chosen
xgb_params = {}
xgb_params['learning_rate'] = [0.1]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = [480]
xgb_params['max_depth'] = [3]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.9]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_params['reg_alpha'] = [0, 1e-5, 1e-2, 0.1, 1, 100]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
reg_alpha = 0 found
xgb_params = {}
xgb_params['learning_rate'] = [0.01]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = range(480,2030,50)
xgb_params['max_depth'] = [3]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.9]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_params['reg_alpha'] = [0]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
n_estimators = 1830
Refining the step from 50 down to 10
xgb_params = {}
xgb_params['learning_rate'] = [0.01]
xgb_params['min_child_weight'] = [1]
xgb_params['n_estimators'] = range(1780,1880,10)
xgb_params['max_depth'] = [3]
xgb_params['subsample'] = [0.8]
xgb_params['colsample_bytree'] = [0.9]
xgb_params['gamma'] = [0]
xgb_params['scale_pos_weight'] = [1]
xgb_params['reg_alpha'] = [0]
xgb_model_gscv = grid_cv(X_train,y_train,xgb_model,xgb_params)
n_estimators = 1840 found
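For reference, the values found across the tuning steps above assembled into a single estimator (a sketch; the name xgb_final is illustrative, and xgb_model_gscv returned by the last grid search is what is actually fit below):
xgb_final = XGBClassifier(learning_rate=0.01, n_estimators=1840, max_depth=3,
                          min_child_weight=1, subsample=0.8, colsample_bytree=0.9,
                          gamma=0, reg_alpha=0, scale_pos_weight=1)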
%%time
cross_val_model(X_train, y_train, xgb_model_gscv, 3)
start_time = time.time()
xgb_model_gscv.fit(X_train_np, y_train_np)
xgb_time = time.time() - start_time
print('Train Time: %fs' % xgb_time)
xgb_result = evaluate_classifiers(xgb_model_gscv, X_test, y_test)
report_classifiers_results(xgb_result,rock_list,'Fig_12')
model_accuracies = {}
model_accuracies['SVC'] = svc_result[0][1]
model_accuracies['KNN'] = knn_result[0][1]
model_accuracies['RF'] = rf_result[0][1]
model_accuracies['XGB'] = xgb_result[0][1]
model_accuracies_df = pd.DataFrame(model_accuracies, index=['Accuracy'])
model_cv_time = {}
model_cv_time['SVC'] = svc_time
model_cv_time['KNN'] = knn_time
model_cv_time['RF'] = rf_time
model_cv_time['XGB'] = xgb_time
model_cv_time_df = pd.DataFrame(model_cv_time, index=['Train Time (s)'])
model_results_df = pd.concat([model_accuracies_df, model_cv_time_df])
model_results_df.sort_values('Accuracy', axis=1)
log_model = LogisticRegression()
xgb_model_ = xgb_result[0][0]
rf_model_ = rf_result[0][0]
knn_model_ = knn_result[0][0]
from mlxtend.classifier import StackingCVClassifier
stack_model_cvc = StackingCVClassifier(classifiers=[xgb_model_,rf_model_,knn_model_],
use_probas=True,
meta_classifier=log_model,
cv = 3,
stratify=True,
shuffle=True
)
%%time
cross_val_model(X_train, y_train, stack_model_cvc, 2)
start_time = time.time()
stack_model_cvc.fit(X_train_np, y_train_np)
stack_time = time.time() - start_time
print('Train Time: %fs' % stack_time)
stack_result = evaluate_classifiers(stack_model_cvc, X_test, y_test)
report_classifiers_results(stack_result,rock_list,'Fig_13')
model_accuracies = {}
model_accuracies['SVC'] = svc_result[0][1]
model_accuracies['KNN'] = knn_result[0][1]
model_accuracies['RF'] = rf_result[0][1]
model_accuracies['XGB'] = xgb_result[0][1]
model_accuracies['STACK'] = stack_result[0][1]
model_accuracies_df = pd.DataFrame(model_accuracies, index=['Accuracy'])
model_cv_time = {}
model_cv_time['SVC'] = svc_time
model_cv_time['KNN'] = knn_time
model_cv_time['RF'] = rf_time
model_cv_time['XGB'] = xgb_time
model_cv_time['STACK'] = stack_time
model_cv_time_df = pd.DataFrame(model_cv_time, index=['Train Time (s)'])
model_results_df = pd.concat([model_accuracies_df, model_cv_time_df])
model_results_df.sort_values('Accuracy', axis=1)